####################################################################################
## “Do Provinces that Win the Spanish Christmas Lottery Experience a Surge in Incumbent 
## Popularity? An Out-of-Sample Replication of Bagues and Esteve-Volart (2016)”, 
## by Carolina Bernal, Donald P. Green, and David Vilalta
## Script author: Carolina Bernal
## What for: Run simulations to check whether inputting zeroes to
## prizes and expenditure in the last 3 quarters of the year biases estimation
####################################################################################

rm(list=ls())       

library(dplyr)
library(ggplot2)
library(tidyr)
library(lfe)
library(fixest)

data <- read.csv("Data/Our_data/20240814_SurveyData_for_simulations.csv", header=TRUE)

Sys.time()

data$imputed <- ifelse(data$month <= 3, 0 , 1)

library(dplyr)

# Define the number of simulations
n_sims <- 10000 # Be aware that running the 10K sims take several hours. 

# 1) Discretize (Bin) the Continuous `expenditure_all` Variable
# Discretize expenditure_all into bins (e.g., using quantiles)
data <- data %>%
  mutate(expenditure_bin = cut(expenditure_all, 
                               breaks = quantile(expenditure_all, probs = seq(0, 1, by = 0.2), na.rm = TRUE),
                               include.lowest = TRUE, 
                               labels = FALSE))


# 2) Function to conduct randomization inference with a clustered, continuous treatment and conditional assignment
run_randomization_inference_clustered <- function(data) {
  
  # Function to calculate the ATE for a given shuffled treatment
  calc_ate <- function(data) {
    # Fit a linear model to estimate the treatment effect controlling for expenditure_all
    model <-     feols(
      vote_incumbent ~ top_prizes_gdp + expenditure_all + as.factor(municipality_size) + as.factor(female) + as.factor(education) + as.factor(status) + age | fixed_effect + survey,
      cluster = c("province"),
      data = data)
    ate <- coef(model)["top_prizes_gdp"]
    return(ate)
  }
  
  # Run the randomization inference
  sampling_distribution <- replicate(n_sims, {
    # Shuffle treatment at the province level within expenditure bins
    shuffled_data <- data %>%
      group_by(expenditure_bin, province) %>%
      mutate(top_prizes_gdp = sample(top_prizes_gdp, size = n(), replace = FALSE)) %>%
      ungroup()
    
    calc_ate(shuffled_data)
  })
  
  return(sampling_distribution)
}


set.seed(123456)

# 3) Randomization inference with all data (including imputed zeroes)
sampling_distribution_with_imputed <- run_randomization_inference_clustered(data)

# 4) Randomization inference without imputed zeroes
data_no_imputed <- data %>% filter(imputed == 0)
sampling_distribution_without_imputed <- run_randomization_inference_clustered(data_no_imputed)

# 5) Analyze the sampling distributions
mean_with_imputed <- mean(sampling_distribution_with_imputed)
mean_without_imputed <- mean(sampling_distribution_without_imputed)

cat("Mean of sampling distribution with imputed zeroes:", mean_with_imputed, "\n")
cat("Mean of sampling distribution without imputed zeroes:", mean_without_imputed, "\n")

# Check the number of unique values in the sampling distributions
cat("Number of unique values in sampling_distribution_with_imputed:", length(unique(sampling_distribution_with_imputed)), "\n")
cat("Number of unique values in sampling_distribution_without_imputed:", length(unique(sampling_distribution_without_imputed)), "\n")

# 6) Plot the sampling distributions separately using ggplot2 colors
# Convert the sampling distributions into data frames for ggplot2
hist_data_with_imputed <- data.frame(ATE = sampling_distribution_with_imputed)
hist_data_without_imputed <- data.frame(ATE = sampling_distribution_without_imputed)


### PLOTS:
# 7) Histogram for 'sampling_distribution_with_imputed'
ggplot(hist_data_with_imputed, aes(x = ATE)) +
  geom_histogram(fill = "blue", bins = 20, alpha = 0.5) +
  geom_vline(xintercept = 0, color = "red", linetype = "dashed", size = 1) +
  labs(title = "Sampling Distribution with Imputed Zeroes",
       x = "ATE",
       y = "Frequency") +
  theme_minimal()

# 8) Histogram for 'sampling_distribution_without_imputed'
ggplot(hist_data_without_imputed, aes(x = ATE)) +
  geom_histogram(fill = "green", bins = 20, alpha = 0.5) +
  geom_vline(xintercept = 0, color = "red", linetype = "dashed", size = 1) +
  labs(title = "Sampling Distribution without Imputed Zeroes",
       x = "ATE",
       y = "Frequency") +
  theme_minimal()

# 9) Combine the two histograms in a single plot 
# Create a data frame for combined histograms
hist_data <- data.frame(
  ATE = c(sampling_distribution_with_imputed, sampling_distribution_without_imputed),
  Group = c(rep("With Imputed Zeroes", length(sampling_distribution_with_imputed)),
            rep("Without Imputed Zeroes", length(sampling_distribution_without_imputed)))
)

# Plot combined histograms
ggplot(hist_data, aes(x = ATE, fill = Group)) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 20) +
  scale_fill_manual(values = c("blue", "green")) +
  geom_vline(xintercept = 0, color = "red", linetype = "dashed", size = 1) +
  labs(title = "Comparison of Sampling Distributions",
       x = "ATE",
       y = "Frequency",
       fill = "Data") +
  theme_minimal() +
  theme(legend.position = "bottom")  # Move legend below



# Check the size of the filtered dataset
cat("Number of observations in data_no_imputed:", nrow(data_no_imputed), "\n")

# Check the variability of the treatment in the filtered dataset
summary(data_no_imputed$top_prizes_gdp)
table(data_no_imputed$top_prizes_gdp)

# Check for variability within the expenditure bins and provinces
table(data_no_imputed$expenditure_bin, data_no_imputed$province)

# Calculate standard error
se_with_imputed <- sd(sampling_distribution_with_imputed) / sqrt(length(sampling_distribution_with_imputed))
se_without_imputed <- sd(sampling_distribution_without_imputed) / sqrt(length(sampling_distribution_without_imputed))

# Calculate 95% confidence intervals
ci_with_imputed <- mean(sampling_distribution_with_imputed) + c(-1.96, 1.96) * se_with_imputed
ci_without_imputed <- mean(sampling_distribution_without_imputed) + c(-1.96, 1.96) * se_without_imputed

# Output the results to build Table 13 in the Appendix: 
cat("Mean of sampling distribution with imputed zeroes:", mean_with_imputed, "\n")
cat("Mean of sampling distribution without imputed zeroes:", mean_without_imputed, "\n")

cat("Standard Error with imputed zeroes:", se_with_imputed, "\n")
cat("95% Confidence Interval with imputed zeroes:", ci_with_imputed, "\n")
cat("Does CI with imputed zeroes exclude zero?", !between(0, ci_with_imputed[1], ci_with_imputed[2]), "\n\n")

cat("Standard Error without imputed zeroes:", se_without_imputed, "\n")
cat("95% Confidence Interval without imputed zeroes:", ci_without_imputed, "\n")
cat("Does CI without imputed zeroes exclude zero?", !between(0, ci_without_imputed[1], ci_without_imputed[2]), "\n")

save.image(file='Data/Simulation_AllCtrols_10K.RData')

Sys.time()
# load('Data/Simulation_AllCtrols_10K.RData')  ## load this if you want to check the simulation results but have no time to run it yourself

savehistory("Appendix_SurveyEvidence_Figures.Rhistory")
